1 tidy data

UHF_zipcode = 
  read_csv("./data/appb.csv") %>% 
  slice(-43) %>% 
  select(-Borough) %>% 
  rename("UHF" = "UHF Neighborhood") %>% 
  janitor::clean_names()
## Parsed with column specification:
## cols(
##   Borough = col_character(),
##   `UHF Neighborhood` = col_character(),
##   `Zip Code` = col_character()
## )
raw_hiv = 
  GET("https://data.cityofnewyork.us/api/views/fju2-rdad/rows.csv") %>% 
  content("parsed") %>% 
  janitor::clean_names()
## Parsed with column specification:
## cols(
##   Year = col_integer(),
##   Borough = col_character(),
##   UHF = col_character(),
##   Gender = col_character(),
##   Age = col_character(),
##   Race = col_character(),
##   `HIV diagnoses` = col_integer(),
##   `HIV diagnosis rate` = col_double(),
##   `Concurrent diagnoses` = col_integer(),
##   `% linked to care within 3 months` = col_integer(),
##   `AIDS diagnoses` = col_integer(),
##   `AIDS diagnosis rate` = col_double(),
##   `PLWDHI prevalence` = col_double(),
##   `% viral suppression` = col_integer(),
##   Deaths = col_integer(),
##   `Death rate` = col_double(),
##   `HIV-related death rate` = col_double(),
##   `Non-HIV-related death rate` = col_double()
## )
combine_hiv = 
  right_join(UHF_zipcode, raw_hiv, by = "uhf") %>%
  janitor::clean_names() %>% 
  separate(zip_code, into = c("zipcode1", "zipcode2", "zipcode3", 
                              "zipcode4", "zipcode5", "zipcode6", "zipcode7", "zipcode8",
                              "zipcode9"), sep = ", ") %>% 
  gather(key = zip_code, value = zipcode_value, zipcode1:zipcode9) %>% 
  filter(!is.na(zipcode_value)) %>% 
  rename("zipcode" = "zipcode_value") %>% 
  select(zipcode, everything(), -zip_code)
r = GET('http://data.beta.nyc//dataset/3bf5fb73-edb5-4b05-bb29-7c95f4a727fc/resource/6df127b1-6d04-4bb7-b983-07402a2c3f90/download/f4129d9aa6dd4281bc98d0f701629b76nyczipcodetabulationareas.geojson')
nyc_zipcode = readOGR(content(r,'text'), 'OGRGeoJSON', verbose = F)
## No encoding supplied: defaulting to UTF-8.
zipcode_lat_lng = nyc_zipcode@data %>% 
  select(zipcode = postalCode, longitude, latitude) %>% 
  mutate(zipcode = as.character(zipcode))
  
combine_hiv1 = 
  full_join(zipcode_lat_lng, combine_hiv, by = "zipcode") %>% 
  mutate(longitude = as.numeric(longitude), latitude = as.numeric(latitude)) %>% 
  group_by(uhf) %>% 
  summarise(lng = mean(longitude),
            lat = mean(latitude)) %>% 
  filter(!(uhf == "Pelhem - Throgs Neck"))
pums_raw = read_csv("./data/selected_pums.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_integer(),
##   PUMA00 = col_integer(),
##   PUMA10 = col_integer(),
##   ADJINC = col_integer(),
##   PINCP = col_integer(),
##   RAC3P05 = col_integer(),
##   RAC3P12 = col_integer()
## )
temp = tempfile(fileext = ".xls")

dataURL = "http://faculty.baruch.cuny.edu/geoportal/resources/nyc_geog/nyc_zcta10_to_puma10.xls"

download.file(dataURL, destfile = temp, mode = 'wb')

zcta_to_puma = readxl::read_xls(temp, sheet = 2) %>% 
  select(zcta = zcta10, puma = puma10) %>% 
  mutate(puma = as.numeric(puma))

dataURL =
  "http://faculty.baruch.cuny.edu/geoportal/resources/nyc_geog/zip_to_zcta10_nyc_revised.xls"

download.file(dataURL, destfile = temp, mode = 'wb')
zip_to_zcta = readxl::read_xls(temp, sheet = 2) %>% 
  select(zipcode, zcta = zcta5) 
pums_data = 
  pums_raw %>% 
  select(puma = PUMA10, income = PINCP, year = ADJINC) %>% 
  filter(puma != -9)  # remove data from 2011 due to lack of area information

pums_data$year = recode(pums_data$year, 
                        "1042852" = "2012",
                        "1025215" = "2013",  
                        "1009585" = "2014", 
                        "1001264" = "2015")   

pums_data = 
  pums_data %>% 
  group_by(year, puma) %>% 
  summarise(mid_income = median(income, na.rm = TRUE)) %>% 
  ungroup()         # calculate yearly median income for each area
puma_to_zipcode = right_join(zip_to_zcta, zcta_to_puma, by = "zcta") %>%   # generaate a puma to zipcode file
  select(puma, zipcode)

income_zipcode = right_join(pums_data, puma_to_zipcode, by = "puma") %>%  # matching zipcode with median income data
  select(year, zipcode, mid_income) %>% 
  mutate(year = as.numeric(year))

combine_hiv_income = 
  left_join(combine_hiv, income_zipcode, by = c("year", "zipcode"))

2 visualization

First, import and tidy data:

gender neighborhood VS hiv

neb_plot = hiv_data %>% 
  group_by(neighborhood, gender) %>% 
  filter(year != "ALL", borough != "All", neighborhood != "All", gender != "All") %>% 
  filter(age != "All") %>%
  summarise(sum_hiv = sum(hiv_diagnoses)) %>% 
  ggplot(aes(x = reorder(neighborhood, sum_hiv), y = sum_hiv, color = gender)) + 
  coord_flip() +
  geom_point() +
  labs(
        title = "Gender and Neighborhood Influence on HIV Incidence",
        x = "Neighborhood",
        y = "HIV diagnoses",
        caption = "Data from the ..."
      )

ggplotly(neb_plot)

The number of HIV diagnoses is higher among male than female in all neighborhoods. Beford Stuyvesant - Crown Heights have the most HIV diagnoses for both men and women.

Gender, age vs hiv

  • Omit the data of transgender group, because the data of transgender were not divided into specific age groups.
age_plot = hiv_data %>% 
  filter(race == "All" & borough == "All" & age != "All") %>% 
  group_by(gender, age) %>% 
  summarise(sum_hiv = sum(hiv_diagnoses)) %>% 
  ggplot(aes(y = sum_hiv, x = age, fill = gender)) + 
  geom_bar(stat = "identity", alpha = 0.8, position = position_dodge()) +
  scale_fill_brewer(palette = "Dark2") +
  labs(
        title = "Gender and Age Influence on HIV Incidence",
        x = "Age range",
        y = "HIV diagnoses",
        caption = "Data from the ..."
      ) 

ggplotly(age_plot)

Gender race vs hiv

race_plot = hiv_data %>% 
  filter(age == "All" & borough == "All" & race != "All") %>% 
  group_by(gender, race) %>% 
  summarise(sum_hiv = sum(hiv_diagnoses)) %>% 
  ggplot(aes(y = sum_hiv, x = reorder(race, sum_hiv), fill = gender)) + 
  geom_bar(stat = "identity", alpha = 0.8, position = position_dodge()) +
  scale_fill_manual(values = c("#E69F00", "#56B4E9")) +
  labs(
        title = "Race and Gender Influence on HIV Incidence",
        x = "Race",
        y = "HIV diagnoses",
        caption = "Data from the ..."
      ) 

ggplotly(race_plot)

hiv diagnoses in borough with most hiv over years

hiv_data %>%
  filter(borough != "All", neighborhood == "All", gender == "All", age == "All", race == "All") %>% 
  group_by(borough) %>% 
  summarize(sum_hiv = sum(hiv_diagnoses)) %>% 
  arrange(desc(sum_hiv))
## # A tibble: 5 x 2
##   borough       sum_hiv
##   <chr>           <int>
## 1 Brooklyn         3815
## 2 Manhattan        3536
## 3 Bronx            2736
## 4 Queens           2327
## 5 Staten Island     217
year_plot = hiv_data %>% 
  mutate(year = as.integer(year)) %>% 
  filter(borough == "Brooklyn" & gender == "Male" & age == "20 - 29") %>% 
  group_by(year, neighborhood) %>% 
  summarize(sum_hiv = sum(hiv_diagnoses)) %>% 
  ggplot(aes(x = year, y = sum_hiv, color = neighborhood)) +
  geom_line()
  
ggplotly(year_plot)

Income vs Hiv

income_plot = combine_hiv_income %>% 
  filter(year != 2011, gender == "All", age == "All", race == "All") %>% 
  group_by(uhf) %>%
  summarise(sum_hiv = sum(hiv_diagnoses), mean_income = mean(mid_income)) %>% 
  ggplot(aes(x = mean_income, y = sum_hiv)) + 
  geom_point() +
  labs(
        title = "Income Influence on HIV Incidence",
        x = "Average income of each neighborhood",
        y = "HIV diagnoses",
        caption = "Data from the ..."
      )

ggplotly(income_plot)

Limitations:

We can not visualize the effect of age and race at the same time.

3 regression

Hiv diagnoses

income_hiv %>% 
  filter(year != "2011" & age != "All") %>%
  lm(hiv_diagnoses ~ borough + gender + age + mid_income, data = .) %>% 
  summary() %>% 
  broom::tidy() %>% 
  knitr::kable(digits = 3)
term estimate std.error statistic p.value
(Intercept) 0.983 0.302 3.252 0.001
boroughBrooklyn 0.297 0.281 1.060 0.289
boroughManhattan 3.091 0.331 9.332 0.000
boroughQueens -1.245 0.259 -4.811 0.000
boroughStaten Island -4.376 0.397 -11.016 0.000
genderMale 6.083 0.152 40.138 0.000
age20 - 29 9.600 0.262 36.576 0.000
age30 - 39 6.870 0.262 26.175 0.000
age40 - 49 4.627 0.262 17.627 0.000
age50 - 59 2.355 0.262 8.972 0.000
age60+ 0.427 0.262 1.626 0.104
mid_income 0.000 0.000 -17.851 0.000
income_hiv %>% 
  filter(year != "2011" & race != "All") %>%
  lm(hiv_diagnoses ~ borough + gender + race + mid_income, data = .) %>% 
  summary() %>% 
  broom::tidy() %>% 
  knitr::kable(digits = 3)
term estimate std.error statistic p.value
(Intercept) 1.514 0.514 2.945 0.003
boroughBrooklyn 0.357 0.493 0.724 0.469
boroughManhattan 3.710 0.582 6.376 0.000
boroughQueens -1.494 0.454 -3.287 0.001
boroughStaten Island -5.251 0.698 -7.527 0.000
genderMale 7.299 0.266 27.425 0.000
raceBlack 10.932 0.421 25.978 0.000
raceLatino/Hispanic 9.027 0.421 21.451 0.000
raceOther/Unknown -1.380 0.421 -3.278 0.001
raceWhite 3.628 0.421 8.621 0.000
mid_income 0.000 0.000 -12.197 0.000
income_plot = income_hiv %>% 
  filter(year != "2011") %>% 
  group_by(uhf, year) %>% 
  summarise(sum_hiv = mean(hiv_diagnoses), mid_in = median(mid_income)) %>% 
  ggplot(aes(x = mid_in, y = sum_hiv, color = year)) +
  geom_point() + 
  geom_smooth(method = lm) +
  theme_bw() +
  theme(legend.position = "None")
ggplotly(income_plot)

Income distribution in different neighborhood

income_dist = income_hiv %>% 
  ggplot(aes(y = mid_income, x = uhf)) +
  geom_point(alpha = 0.1) +
  coord_flip() +
  theme_bw()
ggplotly(income_dist)         

HIV diagnosis rate

income_hiv %>% 
  filter(year != "2011" & age != "All") %>%
  lm(hiv_diagnosis_rate ~ borough + gender + age + mid_income, data = .) %>% 
  summary() %>% 
  broom::tidy() %>% 
  knitr::kable(digits = 3)
term estimate std.error statistic p.value
(Intercept) 17.851 1.511 11.811 0.000
boroughBrooklyn -12.078 1.403 -8.611 0.000
boroughManhattan 15.424 1.656 9.316 0.000
boroughQueens -22.620 1.293 -17.492 0.000
boroughStaten Island -30.524 1.985 -15.377 0.000
genderMale 39.822 0.757 52.582 0.000
age20 - 29 44.060 1.312 33.589 0.000
age30 - 39 33.215 1.312 25.322 0.000
age40 - 49 27.986 1.312 21.336 0.000
age50 - 59 13.841 1.312 10.552 0.000
age60+ -4.261 1.312 -3.249 0.001
mid_income -0.001 0.000 -18.326 0.000
income_hiv %>% 
  filter(year != "2011" & race != "All") %>%
  lm(hiv_diagnosis_rate ~ borough + gender + race + mid_income, data = .) %>% 
  summary() %>% 
  broom::tidy() %>% 
  knitr::kable(digits = 3)
term estimate std.error statistic p.value
(Intercept) 0.795 2.459 0.323 0.747
boroughBrooklyn -11.951 2.357 -5.070 0.000
boroughManhattan 22.135 2.783 7.955 0.000
boroughQueens -25.218 2.173 -11.603 0.000
boroughStaten Island -31.251 3.336 -9.367 0.000
genderMale 49.480 1.273 38.875 0.000
raceBlack 61.810 2.012 30.713 0.000
raceLatino/Hispanic 34.404 2.012 17.095 0.000
raceOther/Unknown 9.809 2.012 4.874 0.000
raceWhite 12.984 2.012 6.452 0.000
mid_income 0.000 0.000 -1.729 0.084
income_plot_diag_rate = income_hiv %>% 
  filter(year != "2011") %>% 
  group_by(uhf, year) %>% 
  summarise(sum_hiv_diagnosis_rate = sum(hiv_diagnosis_rate), mid_in = median(mid_income)) %>% 
  ggplot(aes(x = mid_in, y = sum_hiv_diagnosis_rate, color = year)) +
  geom_point() + 
  geom_smooth(method = lm) +
  theme_bw() +
  theme(legend.position = "None")
ggplotly(income_plot_diag_rate)

4 map plot

map_data diagnoses

diagnoses_by_zipcode =  
  combine_hiv_income %>%
  filter(gender == "All", age == "All", race == "All") %>% 
  mutate(zipcode = factor(zipcode)) %>% 
  group_by(zipcode, uhf) %>% 
  summarise(sum_diagnoses = sum(hiv_diagnoses), sum_diagnosis_rate = sum(hiv_diagnosis_rate))
  
map_data = geo_join(nyc_zipcode, diagnoses_by_zipcode, "postalCode", "zipcode")

point

points_spdf = combine_hiv1 %>% 
  filter(!is.na(lng))
  
coordinates(points_spdf) = ~lng + lat
proj4string(points_spdf) = proj4string(nyc_zipcode)
matches = over(points_spdf, nyc_zipcode)

points = full_join(matches, rename(diagnoses_by_zipcode, postalCode = zipcode), by = "postalCode")
## Warning: Column `postalCode` joining factors with different levels,
## coercing to character vector

map1 diagnoses

pal1 = colorNumeric(palette = "Reds", domain = range(map_data@data$sum_diagnoses, na.rm = T))
# "BuPu" "viridis" "Greens""inferno"

leaflet(map_data) %>%
  addTiles() %>%  
  addPolygons(color = "black", weight = 1,
              fillColor = ~pal1(sum_diagnoses), fillOpacity = 0.8, 
              popup = ~stringr::str_c(uhf, "  sum:", factor(sum_diagnoses))) %>% 
  addMarkers(~longitude, ~latitude,
             popup = ~stringr::str_c(uhf, "  sum:", factor(sum_diagnoses)), data = points) %>%  
  addProviderTiles("CartoDB.Positron") %>%
  addLegend(position = "bottomright", pal = pal1, values = ~sum_diagnoses,
            title = "The amount of HIV diagnoses", opacity = 0.8) %>% 
  setView(-73.98, 40.75, zoom = 11)
## Warning in validateCoords(lng, lat, funcName): Data contains 127 rows with
## either missing or invalid lat/lon values and will be ignored

map2 diagnoses_rate

pal2 = colorNumeric(palette = "Reds", domain = range(map_data@data$sum_diagnosis_rate, na.rm = T))
# "BuPu" "viridis" "Greens""inferno"

leaflet(map_data) %>%
  addTiles() %>%  
  addPolygons(color = "black", weight = 1,
              fillColor = ~pal1(sum_diagnosis_rate), fillOpacity = 0.8, 
              popup = ~stringr::str_c(uhf, "  sum:", factor(sum_diagnosis_rate))) %>% 
  addMarkers(~longitude, ~latitude,
             popup = ~stringr::str_c(uhf, "  sum:", factor(sum_diagnosis_rate)), data = points) %>%  
  addProviderTiles("CartoDB.Positron") %>%
  addLegend(position = "bottomright", pal = pal2, values = ~sum_diagnosis_rate,
            title = "The amount of HIV diagnosis rate", opacity = 0.8) %>% 
  setView(-73.98, 40.75, zoom = 11)
## Warning in validateCoords(lng, lat, funcName): Data contains 127 rows with
## either missing or invalid lat/lon values and will be ignored

map_data_income

income_by_zipcode = 
  combine_hiv_income %>% 
  filter(year != "2011") %>% 
  filter(gender == "All", age == "All", race == "All") %>% 
  group_by(zipcode, uhf) %>% 
  summarise(mean_income = mean(mid_income))
  
map_data_income = geo_join(nyc_zipcode, income_by_zipcode, "postalCode", "zipcode")

map3

pal3 = colorNumeric(palette = "Greens", domain = range(map_data_income@data$mean_income, na.rm = T))

leaflet(map_data_income) %>%
  addTiles() %>%  
  addPolygons(color = "black", weight = 1,
              fillColor = ~pal3(mean_income), fillOpacity = 0.8, 
              popup = ~stringr::str_c(uhf, "  sum:", factor(mean_income))) %>% 
  addProviderTiles("CartoDB.Positron") %>%
  addLegend(position = "bottomright", pal = pal3, values = ~mean_income,
            title = "Mean income of each uhf", opacity = 0.8) %>% 
  setView(-73.98, 40.75, zoom = 11)